In [1]:
"""Template for showing the results of the last experiment in MLFlow."""
import logging
import numpy as np
import helpsk as hlp
import pandas as pd
import plotly_express as px
from helpsk.utility import read_pickle, Timer
from helpsk.sklearn_eval import MLExperimentResults
from source.service.model_registry import ModelRegistry
%cd /code
from source.config import config # noqa: E402
# `import logging` alone does not load the `logging.config` submodule; without this
# explicit import the call below only works when some other import happened to load
# it first. Importing it here keeps the cell working on a fresh kernel.
import logging.config  # noqa: E402

# Configure logging from the project's config file (path is relative to the working
# directory set by `%cd /code` above). `logfilename` is made available to the config
# file through `defaults`, and loggers created by earlier imports stay enabled.
logging.config.fileConfig(
    "source/config/logging_to_file.conf",
    defaults={'logfilename': 'output/log.log'},
    disable_existing_loggers=False,
)
/usr/local/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library. self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
/code
Get Latest Experiment Run from MLFlow¶
In [2]:
# Connect to the model registry at the configured tracking URI and look up the
# configured experiment by name.
registry = ModelRegistry(tracking_uri=config.experiment_server_url())
experiment = registry.get_experiment_by_name(exp_name=config.experiment_name())
# Log the identifiers and metric(s) of the experiment's last run.
logging.info(f"Experiment id: {experiment.last_run.exp_id}")
logging.info(f"Experiment name: {experiment.last_run.exp_name}")
logging.info(f"Run id: {experiment.last_run.run_id}")
logging.info(f"Metric(s): {experiment.last_run.metrics}")
2023-11-24 20:59:39 - INFO | Experiment id: 1
2023-11-24 20:59:39 - INFO | Experiment name: credit
2023-11-24 20:59:39 - INFO | Run id: 7c18722134d54a99991126ac0a1c2971
2023-11-24 20:59:39 - INFO | Metric(s): {'roc_auc': 0.753377535324465}
Last Run vs Production¶
What is the metric/performance from the model associated with the last run?
In [3]:
# Metrics of the experiment's last run (the same attribute logged in the cell above).
logging.info(f"last run metrics: {experiment.last_run.metrics}")
2023-11-24 20:59:39 - INFO | last run metrics: {'roc_auc': 0.753377535324465}
What is the metric/performance of the model in production?
In [4]:
# Look up the production run for the configured model name and log its metrics.
production_run = registry.get_production_run(model_name=config.model_name())
logging.info(f"production run metrics: {production_run.metrics}")
2023-11-24 20:59:39 - INFO | production run metrics: {'roc_auc': 0.753377535324465}
Last Run¶
In [5]:
# Display the underlying mlflow object wrapped by `last_run`
# (the repr below shows an mlflow `Run` with its data, info and inputs).
experiment.last_run.mlflow_entity
Out[5]:
<Run: data=<RunData: metrics={'roc_auc': 0.753377535324465}, params={'prep__numeric__imputer__transformer': 'SimpleImputer()',
'prep__numeric__pca__transformer': 'None',
'prep__numeric__scaler__transformer': 'None',
'prep__savings_status__savings_encoder__transformer': "OneHotEncoder(handle_unknown='ignore')"}, tags={'mlflow.log-model.history': '[{"run_id": "7c18722134d54a99991126ac0a1c2971", '
'"artifact_path": "model", "utc_time_created": '
'"2023-11-24 20:59:35.369562", "flavors": '
'{"python_function": {"model_path": "model.pkl", '
'"predict_fn": "predict", "loader_module": '
'"mlflow.sklearn", "python_version": "3.11.6", '
'"env": {"conda": "conda.yaml", "virtualenv": '
'"python_env.yaml"}}, "sklearn": '
'{"pickled_model": "model.pkl", '
'"sklearn_version": "1.3.2", '
'"serialization_format": "cloudpickle", "code": '
'null}}, "model_uuid": '
'"8c19cae6bc014da3b8e7ada65bda5e94", '
'"mlflow_version": "2.8.0", "model_size_bytes": '
'15618339}]',
'mlflow.note.content': '2023_11_24_20_59_08',
'mlflow.runName': '2023_11_24_20_59_08',
'mlflow.source.git.commit': '81a963fcbc4794b8b7bc6c330fc6b034760eb65d',
'mlflow.source.name': 'source/entrypoints/cli.py',
'mlflow.source.type': 'LOCAL',
'mlflow.user': 'root',
'type': 'BayesSearchCV'}>, info=<RunInfo: artifact_uri='/code/mlflow-artifact-root/1/7c18722134d54a99991126ac0a1c2971/artifacts', end_time=1700859577033, experiment_id='1', lifecycle_stage='active', run_id='7c18722134d54a99991126ac0a1c2971', run_name='2023_11_24_20_59_08', run_uuid='7c18722134d54a99991126ac0a1c2971', start_time=1700859548079, status='FINISHED', user_id='root'>, inputs=<RunInputs: dataset_inputs=[]>>
Load Training & Test Data Info¶
In [6]:
# Fetch the train/test splits logged as pickle artifacts with the last run. The
# generator is consumed left to right by the unpacking, so the four downloads happen
# in the same order as the names on the left-hand side.
with Timer("Loading training/test datasets"):
    X_train, X_test, y_train, y_test = (  # noqa
        experiment.last_run.download_artifact(artifact_name=pickle_name, read_from=read_pickle)
        for pickle_name in ('x_train.pkl', 'x_test.pkl', 'y_train.pkl', 'y_test.pkl')
    )
Timer Started: Loading training/test datasets Timer Finished (0.01 seconds)
In [7]:
# Log the sizes of the four downloaded splits as a quick sanity check.
logging.info(f"training X shape: {X_train.shape}")
logging.info(f"training y length: {len(y_train)}")
logging.info(f"test X shape: {X_test.shape}")
logging.info(f"test y length: {len(y_test)}")
2023-11-24 20:59:39 - INFO | training X shape: (800, 20) 2023-11-24 20:59:39 - INFO | training y length: 800 2023-11-24 20:59:39 - INFO | test X shape: (200, 20) 2023-11-24 20:59:39 - INFO | test y length: 200
In [8]:
# Distinct labels in y_train and how many times each one occurs.
np.unique(y_train, return_counts=True)
Out[8]:
(array([0, 1]), array([559, 241]))
In [9]:
# Share of each class in the training target. The per-class counts are computed
# once and reused for the denominator instead of calling `np.unique` a second time.
train_y_counts = np.unique(y_train, return_counts=True)[1]
train_y_proportion = train_y_counts / np.sum(train_y_counts)
logging.info(f"balance of y in training: {train_y_proportion}")
2023-11-24 20:59:39 - INFO | balance of y in training: [0.69875 0.30125]
In [10]:
# Share of each class in the test target. The per-class counts are computed once
# and reused for the denominator instead of calling `np.unique` a second time.
test_y_counts = np.unique(y_test, return_counts=True)[1]
test_y_proportion = test_y_counts / np.sum(test_y_counts)
logging.info(f"balance of y in test: {test_y_proportion}")
2023-11-24 20:59:39 - INFO | balance of y in test: [0.705 0.295]
Cross Validation Results¶
Best Scores/Params¶
In [11]:
# Download the last run's `experiment.yaml` artifact and parse it into an
# MLExperimentResults object (the cross-validation results explored below).
results = experiment.last_run.download_artifact(
artifact_name='experiment.yaml',
read_from=MLExperimentResults.from_yaml_file,
)
# Log the best score and the best parameters reported by the results object.
logging.info(f"Best Score: {results.best_score}")
logging.info(f"Best Params: {results.best_params}")
2023-11-24 20:59:39 - INFO | Best Score: 0.753377535324465
2023-11-24 20:59:39 - INFO | Best Params: {'model': 'RandomForestClassifier()', 'imputer': 'SimpleImputer()', 'scaler': 'None', 'pca': 'None', 'savings_status_encoder': 'OneHotEncoder()'}
In [12]:
# Best model from each model-type: rank the trials inside every `model` group by
# mean ROC AUC (highest first, ties broken by row order) and keep each group's top row.
data = results.to_formatted_dataframe(return_style=False, include_rank=True)
rank_within_model = data.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
data = data.assign(model_rank=rank_within_model)
data.query('model_rank == 1')
Out[12]:
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | ... | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | num_leaves | imputer | scaler | pca | savings_status_encoder | model_rank | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 1 | 0.753 | 0.649 | 0.857 | RandomForestClassifier() | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | None | None | OneHotEncoder() | 1.0 |
| 5 | 3 | 0.750 | 0.656 | 0.845 | ExtraTreesClassifier() | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | None | None | OneHotEncoder() | 1.0 |
| 0 | 5 | 0.744 | 0.641 | 0.847 | LogisticRegression() | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | StandardScaler() | None | OneHotEncoder() | 1.0 |
| 19 | 6 | 0.744 | 0.640 | 0.848 | XGBClassifier() | NaN | NaN | 1.0 | 1315.0 | NaN | ... | 0.530125 | 0.985952 | 0.001700 | 2.777691 | NaN | SimpleImputer(strategy='median') | None | None | SavingsStatusEncoder() | 1.0 |
| 23 | 12 | 0.732 | 0.613 | 0.850 | LGBMClassifier() | NaN | NaN | NaN | NaN | NaN | ... | 0.745611 | NaN | 0.877292 | 47.641776 | 497.0 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() | 1.0 |
5 rows × 26 columns
In [13]:
# Styled table of the trials with their rank column.
# NOTE(review): num_rows=500 presumably caps the number of rows displayed — confirm in helpsk.
results.to_formatted_dataframe(return_style=True,
include_rank=True,
num_rows=500)
Out[13]:
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | learning_rate | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | num_leaves | imputer | scaler | pca | savings_status_encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.753 | 0.649 | 0.857 | RandomForestClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 2 | 0.751 | 0.624 | 0.878 | RandomForestClassifier() | <NA> | 0.583 | 35.000 | 1,474.000 | 22.000 | 5.000 | 0.765 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() |
| 3 | 0.750 | 0.656 | 0.845 | ExtraTreesClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 4 | 0.747 | 0.631 | 0.863 | RandomForestClassifier() | <NA> | 0.239 | 41.000 | 1,886.000 | 3.000 | 15.000 | 0.864 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | SavingsStatusEncoder() |
| 5 | 0.744 | 0.641 | 0.847 | LogisticRegression() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 6 | 0.744 | 0.640 | 0.848 | XGBClassifier() | <NA> | <NA> | 1.000 | 1,315.000 | <NA> | <NA> | <NA> | <NA> | 0.022 | 17.000 | 0.716 | 0.530 | 0.986 | 0.002 | 2.778 | <NA> | SimpleImputer(strategy='median') | None | None | SavingsStatusEncoder() |
| 7 | 0.743 | 0.703 | 0.783 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | MinMaxScaler() | None | SavingsStatusEncoder() |
| 8 | 0.739 | 0.606 | 0.872 | RandomForestClassifier() | <NA> | 0.911 | 74.000 | 1,265.000 | 39.000 | 17.000 | 0.751 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 9 | 0.739 | 0.626 | 0.851 | RandomForestClassifier() | <NA> | 0.445 | 87.000 | 1,244.000 | 33.000 | 27.000 | 0.795 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 10 | 0.734 | 0.672 | 0.796 | ExtraTreesClassifier() | <NA> | 0.135 | 15.000 | 1,987.000 | 10.000 | 39.000 | 0.708 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | None | SavingsStatusEncoder() |
| 11 | 0.732 | 0.606 | 0.859 | XGBClassifier() | <NA> | <NA> | 20.000 | 1,733.000 | <NA> | <NA> | <NA> | <NA> | 0.010 | 2.000 | 0.673 | 0.772 | 0.881 | 0.281 | 2.005 | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 12 | 0.732 | 0.613 | 0.850 | LGBMClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 0.732 | 0.746 | <NA> | 0.877 | 47.642 | 497.000 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 13 | 0.731 | 0.646 | 0.816 | LGBMClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 0.630 | 0.248 | <NA> | 5.830 | 44.985 | 347.000 | SimpleImputer(strategy='most_frequent') | None | None | OneHotEncoder() |
| 14 | 0.727 | 0.633 | 0.821 | ExtraTreesClassifier() | <NA> | 0.502 | 29.000 | 1,249.000 | 25.000 | 35.000 | 0.855 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 15 | 0.725 | 0.590 | 0.860 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | StandardScaler() | PCA('mle') | SavingsStatusEncoder() |
| 16 | 0.723 | 0.585 | 0.862 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | None | SavingsStatusEncoder() |
| 17 | 0.723 | 0.588 | 0.859 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | SavingsStatusEncoder() |
| 18 | 0.723 | 0.633 | 0.813 | ExtraTreesClassifier() | <NA> | 0.537 | 2.000 | 1,275.000 | 14.000 | 47.000 | 0.805 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | SavingsStatusEncoder() |
| 19 | 0.722 | 0.637 | 0.808 | ExtraTreesClassifier() | <NA> | 0.768 | 54.000 | 909.000 | 16.000 | 30.000 | 0.762 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | SavingsStatusEncoder() |
| 20 | 0.721 | 0.622 | 0.819 | XGBClassifier() | <NA> | <NA> | 3.000 | 1,482.000 | <NA> | <NA> | <NA> | <NA> | 0.067 | 18.000 | 0.889 | 0.636 | 0.615 | 0.000 | 2.093 | <NA> | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 21 | 0.715 | 0.667 | 0.764 | LGBMClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 0.429 | 0.877 | <NA> | 13.997 | 35.364 | 388.000 | SimpleImputer(strategy='most_frequent') | None | None | OneHotEncoder() |
| 22 | 0.713 | 0.604 | 0.823 | LGBMClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 23 | 0.701 | 0.587 | 0.814 | XGBClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 24 | 0.691 | 0.638 | 0.745 | LGBMClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | 0.965 | 0.241 | <NA> | 19.400 | 3.771 | 281.000 | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() |
| 25 | 0.681 | 0.582 | 0.780 | XGBClassifier() | <NA> | <NA> | 4.000 | 1,961.000 | <NA> | <NA> | <NA> | <NA> | 0.271 | 3.000 | 0.671 | 0.797 | 0.696 | 0.000 | 2.150 | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | SavingsStatusEncoder() |
In [14]:
# Only the RandomForestClassifier() trials, with the rank column included.
results.to_formatted_dataframe(query='model == "RandomForestClassifier()"', include_rank=True)
Out[14]:
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | pca | savings_status_encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.753 | 0.649 | 0.857 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | OneHotEncoder() |
| 2 | 0.751 | 0.624 | 0.878 | 0.583 | 35.000 | 1,474.000 | 22.000 | 5.000 | 0.765 | entropy | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 3 | 0.747 | 0.631 | 0.863 | 0.239 | 41.000 | 1,886.000 | 3.000 | 15.000 | 0.864 | gini | SimpleImputer() | PCA('mle') | SavingsStatusEncoder() |
| 4 | 0.739 | 0.606 | 0.872 | 0.911 | 74.000 | 1,265.000 | 39.000 | 17.000 | 0.751 | entropy | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 5 | 0.739 | 0.626 | 0.851 | 0.445 | 87.000 | 1,244.000 | 33.000 | 27.000 | 0.795 | gini | SimpleImputer() | PCA('mle') | OneHotEncoder() |
In [15]:
# Only the LogisticRegression() trials, with the rank column included.
results.to_formatted_dataframe(query='model == "LogisticRegression()"', include_rank=True)
Out[15]:
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | pca | savings_status_encoder |
|---|---|---|---|---|---|---|---|---|
| 1 | 0.744 | 0.641 | 0.847 | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 2 | 0.743 | 0.703 | 0.783 | 0.000 | SimpleImputer() | MinMaxScaler() | None | SavingsStatusEncoder() |
| 3 | 0.725 | 0.590 | 0.860 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | PCA('mle') | SavingsStatusEncoder() |
| 4 | 0.723 | 0.585 | 0.862 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | None | SavingsStatusEncoder() |
| 5 | 0.723 | 0.588 | 0.859 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | SavingsStatusEncoder() |
BayesSearchCV Performance Over Time¶
In [16]:
# Performance across the search trials, faceted by model type.
results.plot_performance_across_trials(facet_by='model').show()
In [17]:
# Performance across trials, restricted to the RandomForestClassifier() trials.
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()
Variable Performance Over Time¶
In [18]:
# Parameter values across trials for the RandomForestClassifier() trials.
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()
Scatter Matrix¶
In [19]:
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
# height=1000, width=1000).show()
Variable Performance - Numeric¶
In [20]:
# Performance against the numeric parameters for the RandomForestClassifier() trials.
# There is no .show() call here: the return value is displayed as the cell's last expression.
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
height=800)
In [21]:
# Parallel-coordinates view of the RandomForestClassifier() trials.
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()
Variable Performance - Non-Numeric¶
In [22]:
# Performance against the non-numeric parameters for the RandomForestClassifier() trials.
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()
In [23]:
# Score vs. `max_features` for the RandomForestClassifier() trials, with `max_depth`
# passed as the size argument and `savings_status_encoder` as the color argument.
results.plot_score_vs_parameter(
query='model == "RandomForestClassifier()"',
parameter='max_features',
size='max_depth',
color='savings_status_encoder',
)
In [24]:
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='max_depth'
# )
In [25]:
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='imputer'
# )
Last Run - Test Set Performance¶
In [26]:
# Download the last run's model artifact.
# NOTE(review): `read_pickle` presumably unpickles the file, which can execute
# arbitrary code — only load artifacts from a trusted artifact store.
last_model = experiment.last_run.download_artifact(
artifact_name='model/model.pkl',
read_from=read_pickle,
)
# The downloaded object exposes the underlying estimator as `.model`; show its type.
print(type(last_model.model))
<class 'sklearn.pipeline.Pipeline'>
In [27]:
# Display the downloaded model object (its repr shows the wrapped pipeline definition).
last_model
Out[27]:
SklearnModelWrapper(model=Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration',
'credit_amount',
'installment_commitment',
'residence_since',
'age',
'existing_credits',
'num_dependents']),
('n...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing',
'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500,
random_state=42))]))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SklearnModelWrapper(model=Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration',
'credit_amount',
'installment_commitment',
'residence_since',
'age',
'existing_credits',
'num_dependents']),
('n...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing',
'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500,
random_state=42))]))Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment',
'residence_since', 'age',
'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing', 'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500, random_state=42))])ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
OneHotEncod...n='ignore'))]),
['checking_status', 'credit_history',
'purpose', 'employment', 'personal_status',
'other_parties', 'property_magnitude',
'other_payment_plans', 'housing', 'job',
'own_telephone', 'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
TransformerChooser(transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerChooser()
TransformerChooser()
['checking_status', 'credit_history', 'purpose', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
OneHotEncoder(handle_unknown='ignore')
['savings_status']
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore'))
OneHotEncoder(handle_unknown='ignore')
OneHotEncoder(handle_unknown='ignore')
RandomForestClassifier(n_estimators=500, random_state=42)
In [28]:
# Predict on the test set with the last run's model and peek at the first ten values
# (these are later passed to the evaluator as `predicted_scores`).
test_predictions = last_model.predict(X_test)
test_predictions[0:10]
Out[28]:
array([0.408, 0.522, 0.678, 0.404, 0.088, 0.454, 0.092, 0.492, 0.176,
0.232])
In [29]:
# Build a two-class evaluator from the test labels and the last-run model's predictions.
# NOTE(review): the rationale for the 0.37 score threshold is not shown in this
# notebook — confirm it is still the intended value.
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
actual_values=y_test,
predicted_scores=test_predictions,
score_threshold=0.37,
)
In [30]:
# Actual-vs-predicted histogram for the last-run model's test predictions.
evaluator.plot_actual_vs_predict_histogram()
In [31]:
# Confusion matrix for the last-run model's test predictions.
evaluator.plot_confusion_matrix()
In [32]:
# Styled table of all metrics for the last-run model, next to the 'prior' and
# 'constant' dummy-classifier baselines, rounded to 3 decimals.
evaluator.all_metrics_df(return_style=True,
dummy_classifier_strategy=['prior', 'constant'],
round_by=3)
Out[32]:
| Score | Dummy (prior) | Dummy (constant) | Explanation | |
|---|---|---|---|---|
| AUC | 0.815 | 0.500 | 0.500 | Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier) |
| True Positive Rate | 0.729 | 0.000 | 1.000 | 72.9% of positive instances were correctly identified.; i.e. 43 "Positive Class" labels were correctly identified out of 59 instances; a.k.a Sensitivity/Recall |
| True Negative Rate | 0.801 | 1.000 | 0.000 | 80.1% of negative instances were correctly identified.; i.e. 113 "Negative Class" labels were correctly identified out of 141 instances |
| False Positive Rate | 0.199 | 0.000 | 1.000 | 19.9% of negative instances were incorrectly identified as positive; i.e. 28 "Negative Class" labels were incorrectly identified as "Positive Class", out of 141 instances |
| False Negative Rate | 0.271 | 1.000 | 0.000 | 27.1% of positive instances were incorrectly identified as negative; i.e. 16 "Positive Class" labels were incorrectly identified as "Negative Class", out of 59 instances |
| Positive Predictive Value | 0.606 | 0.000 | 0.295 | When the model claims an instance is positive, it is correct 60.6% of the time; i.e. out of the 71 times the model predicted "Positive Class", it was correct 43 times; a.k.a precision |
| Negative Predictive Value | 0.876 | 0.705 | 0.000 | When the model claims an instance is negative, it is correct 87.6% of the time; i.e. out of the 129 times the model predicted "Negative Class", it was correct 113 times |
| F1 Score | 0.662 | 0.000 | 0.456 | The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. |
| Precision/Recall AUC | 0.660 | 0.295 | 0.295 | Precision/Recall AUC is calculated with `average_precision` which summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold. See sci-kit learn documentation for caveats. |
| Accuracy | 0.780 | 0.705 | 0.295 | 78.0% of instances were correctly identified |
| Error Rate | 0.220 | 0.295 | 0.705 | 22.0% of instances were incorrectly identified |
| % Positive | 0.295 | 0.295 | 0.295 | 29.5% of the data are positive; i.e. out of 200 total observations; 59 are labeled as "Positive Class" |
| Total Observations | 200 | 200 | 200 | There are 200 total observations; i.e. sample size |
In [33]:
# ROC curve for the last-run model on the test set.
evaluator.plot_roc_auc_curve().show()
In [34]:
# Precision/recall curve for the last-run model on the test set.
evaluator.plot_precision_recall_auc_curve().show()
In [35]:
# Threshold curves for the last-run model over the score-threshold range 0.1 to 0.7.
evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.7)).show()
In [36]:
# Precision/recall tradeoff for the last-run model over the score-threshold range 0.1 to 0.6.
evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6)).show()
In [37]:
# Styled gain/lift table for the last-run model.
# The FutureWarning in the output is raised inside helpsk (sklearn_eval.py), not by this cell.
evaluator.calculate_lift_gain(return_style=True)
/usr/local/lib/python3.11/site-packages/helpsk/sklearn_eval.py:2480: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Out[37]:
| Gain | Lift | |
|---|---|---|
| Percentile | ||
| 5 | 0.14 | 2.71 |
| 10 | 0.22 | 2.20 |
| 15 | 0.36 | 2.37 |
| 20 | 0.51 | 2.54 |
| 25 | 0.54 | 2.17 |
| 30 | 0.68 | 2.26 |
| 35 | 0.73 | 2.08 |
| 40 | 0.76 | 1.91 |
| 45 | 0.76 | 1.69 |
| 50 | 0.81 | 1.63 |
| 55 | 0.85 | 1.54 |
| 60 | 0.85 | 1.41 |
| 65 | 0.88 | 1.36 |
| 70 | 0.90 | 1.28 |
| 75 | 0.95 | 1.27 |
| 80 | 0.97 | 1.21 |
| 85 | 0.98 | 1.16 |
| 90 | 0.98 | 1.09 |
| 95 | 1.00 | 1.05 |
| 100 | 1.00 | 1.00 |
Production Model - Test Set Performance¶
In [38]:
# Download the production run's model artifact.
# NOTE(review): `read_pickle` presumably unpickles the file, which can execute
# arbitrary code — only load artifacts from a trusted artifact store.
production_model = production_run.download_artifact(
artifact_name='model/model.pkl',
read_from=read_pickle,
)
# The downloaded object exposes the underlying estimator as `.model`; show its type.
print(type(production_model.model))
<class 'sklearn.pipeline.Pipeline'>
In [39]:
# Display the downloaded production model object (its repr shows the wrapped pipeline definition).
production_model
Out[39]:
SklearnModelWrapper(model=Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration',
'credit_amount',
'installment_commitment',
'residence_since',
'age',
'existing_credits',
'num_dependents']),
('n...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing',
'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500,
random_state=42))]))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SklearnModelWrapper(model=Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration',
'credit_amount',
'installment_commitment',
'residence_since',
'age',
'existing_credits',
'num_dependents']),
('n...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing',
'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500,
random_state=42))]))Pipeline(steps=[('prep',
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment',
'residence_since', 'age',
'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps...
'employment',
'personal_status',
'other_parties',
'property_magnitude',
'other_payment_plans',
'housing', 'job',
'own_telephone',
'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])),
('model',
RandomForestClassifier(n_estimators=500, random_state=42))])ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser(transformer=SimpleImputer())),
('scaler',
TransformerChooser()),
('pca',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
OneHotEncod...n='ignore'))]),
['checking_status', 'credit_history',
'purpose', 'employment', 'personal_status',
'other_parties', 'property_magnitude',
'other_payment_plans', 'housing', 'job',
'own_telephone', 'foreign_worker']),
('savings_status',
Pipeline(steps=[('savings_encoder',
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
['savings_status'])])['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
TransformerChooser(transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerChooser()
TransformerChooser()
['checking_status', 'credit_history', 'purpose', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
OneHotEncoder(handle_unknown='ignore')
['savings_status']
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore'))
OneHotEncoder(handle_unknown='ignore')
OneHotEncoder(handle_unknown='ignore')
RandomForestClassifier(n_estimators=500, random_state=42)
In [40]:
# Predict on the test set with the production model and peek at the first ten values.
# NOTE: this rebinds `test_predictions`, which until here held the last-run model's predictions.
test_predictions = production_model.predict(X_test)
test_predictions[0:10]
Out[40]:
array([0.408, 0.522, 0.678, 0.404, 0.088, 0.454, 0.092, 0.492, 0.176,
0.232])
In [41]:
# Build a two-class evaluator from the test labels and the production model's predictions.
# NOTE: this rebinds `evaluator`, which until here referred to the last-run model's evaluator.
# NOTE(review): the rationale for the 0.37 score threshold is not shown in this
# notebook — confirm it is still the intended value.
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
actual_values=y_test,
predicted_scores=test_predictions,
score_threshold=0.37,
)
In [42]:
# Actual-vs-predicted histogram for the production model's test predictions.
evaluator.plot_actual_vs_predict_histogram()
In [43]:
# Confusion matrix for the production model's test predictions.
evaluator.plot_confusion_matrix()
In [44]:
# Styled table of all metrics for the production model, next to the 'prior' and
# 'constant' dummy-classifier baselines, rounded to 3 decimals.
evaluator.all_metrics_df(return_style=True,
dummy_classifier_strategy=['prior', 'constant'],
round_by=3)
Out[44]:
| Score | Dummy (prior) | Dummy (constant) | Explanation | |
|---|---|---|---|---|
| AUC | 0.815 | 0.500 | 0.500 | Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier) |
| True Positive Rate | 0.729 | 0.000 | 1.000 | 72.9% of positive instances were correctly identified.; i.e. 43 "Positive Class" labels were correctly identified out of 59 instances; a.k.a Sensitivity/Recall |
| True Negative Rate | 0.801 | 1.000 | 0.000 | 80.1% of negative instances were correctly identified.; i.e. 113 "Negative Class" labels were correctly identified out of 141 instances |
| False Positive Rate | 0.199 | 0.000 | 1.000 | 19.9% of negative instances were incorrectly identified as positive; i.e. 28 "Negative Class" labels were incorrectly identified as "Positive Class", out of 141 instances |
| False Negative Rate | 0.271 | 1.000 | 0.000 | 27.1% of positive instances were incorrectly identified as negative; i.e. 16 "Positive Class" labels were incorrectly identified as "Negative Class", out of 59 instances |
| Positive Predictive Value | 0.606 | 0.000 | 0.295 | When the model claims an instance is positive, it is correct 60.6% of the time; i.e. out of the 71 times the model predicted "Positive Class", it was correct 43 times; a.k.a precision |
| Negative Predictive Value | 0.876 | 0.705 | 0.000 | When the model claims an instance is negative, it is correct 87.6% of the time; i.e. out of the 129 times the model predicted "Negative Class", it was correct 113 times |
| F1 Score | 0.662 | 0.000 | 0.456 | The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. |
| Precision/Recall AUC | 0.660 | 0.295 | 0.295 | Precision/Recall AUC is calculated with `average_precision` which summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold. See sci-kit learn documentation for caveats. |
| Accuracy | 0.780 | 0.705 | 0.295 | 78.0% of instances were correctly identified |
| Error Rate | 0.220 | 0.295 | 0.705 | 22.0% of instances were incorrectly identified |
| % Positive | 0.295 | 0.295 | 0.295 | 29.5% of the data are positive; i.e. out of 200 total observations; 59 are labeled as "Positive Class" |
| Total Observations | 200 | 200 | 200 | There are 200 total observations; i.e. sample size |
In [45]:
# ROC curve for the production model on the test set.
evaluator.plot_roc_auc_curve().show()
In [46]:
# Precision/recall curve for the production model on the test set.
evaluator.plot_precision_recall_auc_curve().show()
In [47]:
# Threshold curves for the production model over the score-threshold range 0.1 to 0.7.
evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.7)).show()
In [48]:
# Precision/recall tradeoff for the production model over the score-threshold range 0.1 to 0.6.
evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6)).show()
In [49]:
# Styled gain/lift table for the production model.
# The FutureWarning in the output is raised inside helpsk (sklearn_eval.py), not by this cell.
evaluator.calculate_lift_gain(return_style=True)
/usr/local/lib/python3.11/site-packages/helpsk/sklearn_eval.py:2480: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Out[49]:
| Gain | Lift | |
|---|---|---|
| Percentile | ||
| 5 | 0.14 | 2.71 |
| 10 | 0.22 | 2.20 |
| 15 | 0.36 | 2.37 |
| 20 | 0.51 | 2.54 |
| 25 | 0.54 | 2.17 |
| 30 | 0.68 | 2.26 |
| 35 | 0.73 | 2.08 |
| 40 | 0.76 | 1.91 |
| 45 | 0.76 | 1.69 |
| 50 | 0.81 | 1.63 |
| 55 | 0.85 | 1.54 |
| 60 | 0.85 | 1.41 |
| 65 | 0.88 | 1.36 |
| 70 | 0.90 | 1.28 |
| 75 | 0.95 | 1.27 |
| 80 | 0.97 | 1.21 |
| 85 | 0.98 | 1.16 |
| 90 | 0.98 | 1.09 |
| 95 | 1.00 | 1.05 |
| 100 | 1.00 | 1.00 |
Feature Importance¶
In [50]:
# Horizontal bar chart of the 20 largest feature importances of the production model.
# This is best-effort: the final estimator may not expose `feature_importances_`,
# so a failure is reported instead of aborting the notebook.
try:
    importances = production_model.model['model'].feature_importances_
    # Output feature names of every pipeline step before the final estimator, with
    # the 'non_numeric__' / 'numeric__' prefixes stripped for readability.
    feature_names = [
        name.replace('non_numeric__', '').replace('numeric__', '')
        for name in production_model.model[:-1].get_feature_names_out()
    ]
    # Ascending sort by importance, so `.tail(20)` below keeps the 20 largest values.
    feature_importances = sorted(
        # strict=True raises if the name and importance counts ever disagree.
        zip(feature_names, importances, strict=True),
        key=lambda pair: pair[1],
    )
    fig = px.bar(
        pd.DataFrame(feature_importances, columns=['feature', 'importance']).tail(20),
        y='feature',
        x='importance',
        orientation='h',
        height=700,
        width=800,
        title='Feature Importances of Production Model',
    )
    fig.show()
except Exception as error:  # noqa: BLE001
    # Catching `Exception` rather than using a bare `except:` lets
    # KeyboardInterrupt/SystemExit propagate. The real cause is logged with its
    # traceback and echoed in the cell output instead of being discarded.
    logging.exception("Error calculating feature importances.")
    print(f"Error calculating feature importances: {error!r}")